rm(list = ls())
cat("\014")
## set up some global options
# always set stringsAsFactors = F when loading data
options(stringsAsFactors=FALSE)
# show the code
knitr::opts_chunk$set(echo = TRUE)
# define all knitr tables to be html format
options(knitr.table.format = 'html')
# change code chunk default to not show warnings or messages
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
libs <- c("tidyverse", "knitr", "readxl", "dismo", "leaflet", "kableExtra", "farmr")
#"NbClust", "clValid", "ggfortify", "clustree", "dendextend", "FactoMineR", "corrplot", "GGally"
lapply(libs, require, character.only = T, quietly = T, warn.conflicts = F)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Warning: package 'readxl' was built under R version 3.4.4
## Warning: package 'sp' was built under R version 3.4.4
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:tidyr':
##
## extract
## Warning: package 'leaflet' was built under R version 3.4.4
## Warning: package 'kableExtra' was built under R version 3.4.4
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
##
## [[5]]
## [1] TRUE
##
## [[6]]
## [1] TRUE
##
## [[7]]
## [1] TRUE
#### define helpful functions
# define function to adjust table widths
html_table_width <- function(kable_output, width) {
width_html <- paste0(paste0('<col width="', width, '">'), collapse = "\n")
sub("<table>", paste0("<table>\n", width_html), kable_output)
}
source("../../oaflib/commcareExport.R")
##
## Attaching package: 'curl'
## The following object is masked from 'package:readr':
##
## parse_date
## Warning: package 'jsonlite' was built under R version 3.4.4
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
source("../../oaflib/misc.R")
source("../../oaflib/plm.R")
select <- dplyr::select
forceUpdateAll <- FALSE
This file will pick up on from the Kenya round 1 file and finish the analysis. The cleaning of the data up through the round 1 data (baseline and round 1) happens in the kenya_round_1 folder. Going forward, the cleaning of new data will happen in the shs_cleaning.R file, which I’ll source here, and then the analysis will happen here.
Analyze the soil health study data.
# load the latest kenya data from the cleaning file >> or maybe do the combining there.
#source("shs_cleaning.R")
keDat <- readRDS("ke_cleaned_combined_fieldDat.rds")
rwDat <- readRDS("rw_cleaned_combined_fieldDat.rds")
It’s hard to know where to draw the line between cleaning and analysis. I’m going to keep this code here even though it’s technically additional data cleaning
Check that we’re actually dealing with clients for which we have multiple records
table(table(rwDat$sample_id)==5)
##
## FALSE TRUE
## 157 2328
So we have some clients for which we don’t have 3 records, one for each survey. Find out what the deal is.
table(table(rwDat$sample_id))
##
## 1 2 3 4 5
## 1 6 110 40 2328
Hm. So the numbers less than 5 could be that we’re just not finding people every year. But the 1s and 2s are particularly strange. Let’s check those out. Here’s a helpful link
singleSurvey <- names(which(table(rwDat$sample_id) == 1))
rwDat %>%
filter(sample_id == singleSurvey)
Looks like this person was only surveyed in the first season. Okay. Check out the other seasons
twoSurveys <- names(which(table(rwDat$sample_id) == 2))
rwDat %>%
filter(sample_id %in% twoSurveys)
Okay, these are from the recent round but they don’t have any previous surveys which is odd. We shouldn’t be adding new farmers into the survey. These can be dropped for simplicity. I can follow up with Eric and Cyprien to find out the issue but I’m just going to drop them for now.
rwDat <- rwDat %>%
filter(!sample_id %in% twoSurveys)
fourSurveys <- names(which(table(rwDat$sample_id)==4))
rwDat %>%
filter(sample_id %in% fourSurveys)
Okay. There are enough surveys in the 3 and 4 category that they’re at least plausible from a data quality and survey strategy perspective. I’ll leave them as they are.
Check that there isn’t something systematic in these surveys. If so, describe what it is.
Create a record of how many farmers are joining and leaving Tubura between the baseline through the current survey round. These numbers only reflect the changes from the last round.
This also should account for clients that we miss in each season so we can say what the attrition has been season to season. Also, why don’t we have 16A results for Rwanda. Find out what’s happening there.
clientCount <- function(dat){
# this assumes season, d_client, and sample_id remain the names of
# the key variables. They should for the data to match.
# this funciton tablulates the number of clients in each season
iniTab <- dat %>%
filter(!is.na(d_client)) %>%
group_by(season, d_client) %>%
tally() %>%
gather(key, value, -c(season, d_client)) %>%
spread(d_client, value) %>%
rename(control = `0`,
client = `1`) %>%
mutate(control = as.numeric(control),
client = as.numeric(client)) %>%
rowwise() %>%
mutate(total = sum(control, client)) %>%
ungroup()
percentDiff <- iniTab %>%
mutate(pct_change = (((total - lead(total))/total) * 100))
return(percentDiff)
}
rw.attrition.tab <- clientCount(rwDat)
ke.attrition.tab <- clientCount(keDat)
attritionRate <- function(org, new){
return((org - new)/org * 100)
}
attritionRate(2028, 1935) # kenya
## [1] 4.585799
attritionRate(2439, 2409) # rwanda
## [1] 1.230012
attritionRate(4467, 4344) # total
## [1] 2.753526
ggplot(rwDat, aes(x = season, y = ph, group = d_client)) + geom_boxplot()
ggplot(keDat, aes(x = can.acre)) +
geom_histogram() +
scale_x_continuous(limits = c(0,200)) +
labs(title = "Set limits to 0 and 100 for CAN")
ggplot(keDat, aes(x = dap.acre)) +
geom_histogram() +
scale_x_continuous(limits = c(0,200)) +
labs(title = "Set limits to 0 and 100 for DAP")
#export data for Diego >> this is the easiest way to have the location and soil information together. The soil data is also available through the warehouse but it isn't easily connected to the survey information.
rwDat %>%
select(sample_id, season, district, cell_field, village, ph, calcium, magnesium, organic.carbon, total.nitrogen) %>%
write_csv(., "output/rwanda_soil_values_travertine_impact.csv")
Check values for upcoming final round of Kenya SHS data collection
This is ideally a grouped boxplot. Fix this when I have internet. This would show the values by season and client type.
See sketch of SHS report. Remember that sameStatus are the farmers that kept their status between baseline and endline. The two models of interest are:
Helpful link for executing code in parallel
Work before 3/18/19
Before running the full models, I’m going to take a closer look at the nitrogen data to make sure we can explain the negative effects we’re seeing in Kenya in the models below. There isn’t a clear agronomic explanation for the negative effect and as a consequence we’re put in a bind in our reporting to explain either the outcome or the modeling.
Let’s start with looking at the relationship between nitrogen and other features to make sure we’re seeing what expect. Here are the notes from Step:
Click here for compost / acre vs. nitrogen rate effects. The short is that there isn’t a clear trend.
Nitrogen application by yield output. The yield values are not making sense between survey rounds. I’ll come back to this.
Look at rotations relative to N rate by 1AF status. It does appear that plots that non-1AF plots receive more legume intercrops. This makes sense given that the 1AF core package emphasizes maize monocrops.
Would love to hear your thoughts - entirely possible I’m missing something here. The most critical question here is whether (i) we’re not applying enough C relative to N, (ii) we’re just not applying enough N. I suspect maybe the answer is both, but I’m wary of assuming it’s the latter and making things worse if the key driver is unbalanced C:N ratios.
3/18/19 forward
Step Aston of ART wants follow up on the possibility that 1AF is driving down N rates through repeated 1AF culti ation. This document lists the key questions we want to answer to understand what might be happening in the data. Those questions are (see document for more specifics):
The big takeaway for me from this plot is that there are regular spikes at certain values. That seems odd for a system that should be pretty continuous. Let’s see if that plays out evenly across seasons.
keDat %>%
ggplot(., aes(x = total.nitrogen)) +
geom_histogram(bins = 100) +
geom_density()
keDat %>%
ggplot(., aes(x = total.nitrogen)) +
geom_histogram() +
geom_vline(xintercept = mean(keDat$total.nitrogen, na.rm = T)) +
facet_grid(~ season)
# ggplot() +
# geom_histogram(data = filter(keDat, keDat$season == 2015), aes(x = total.nitrogen, fill = 'red'), alpha = 0.5) +
# geom_histogram(data = filter(keDat, keDat$season == 2016), aes(x = total.nitrogen, fill = 'blue'), alpha = 0.5) +
# geom_histogram(data = filter(keDat, keDat$season == 2017), aes(x = total.nitrogen, fill = 'green'), alpha = 0.5) +
# theme(legend.position = 'none')
This is what we’re observing. There’s a clear downward trend across seasons with perhaps a slight shift downward for
keDat %>%
#select(season, total.nitrogen, d_client) %>%
ggplot(., aes(x = season, y = total.nitrogen, fill = as.factor(d_client))) +
geom_boxplot() +
theme(legend.position = 'bottom') +
labs(title = "Total N by season and client status", subtitle ="Strong seasonal trend with perhaps a slight client trend downward",
x = "Season", y = "Total N", fill = "Client Status")
keDat %>%
filter(!is.na(d_client)) %>%
group_by(season, d_client) %>%
summarize(mean = round(mean(total.nitrogen, na.rm = T),4)) %>%
kable(caption = "Downward average N by season") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| season | d_client | mean |
|---|---|---|
| 2015 | 0 | 0.1577 |
| 2015 | 1 | 0.1579 |
| 2016 | 0 | 0.1499 |
| 2016 | 1 | 0.1484 |
| 2017 | 0 | 0.1478 |
| 2017 | 1 | 0.1466 |
1AF clients are slightly below comparison farmers in terms of total nitrogren but the difference is ever so slight. Perhaps in the context of soil chemistry these are big differencs though!
This code also addresses identified erroneous variables.
keDat <- keDat %>%
mutate(organic.carbon = ifelse(organic.carbon == 0 & total.nitrogen > 0, NA, organic.carbon))
keDat %>%
ggplot(., aes(x = organic.carbon, y = total.nitrogen, color = as.factor(d_client))) +
geom_point() +
labs(title = "N vs. C - what we'd expect", x = "Total Carbon", y = "Total Nitrogen",
subtitle = "And no clear client, non-client trend", color = "1AF client")
keDat %>%
filter(!is.na(d_client)) %>%
ggplot(., aes(x = organic.carbon, y = total.nitrogen, color = as.factor(d_client))) +
geom_point() +
labs(title = "N vs. C by season - again what we'd expect", x = "Organic C", y = "Total N", color = "1AF client",
subtitle = "and no clear client, non-client trend") +
facet_grid(~ season)
This is the same table as above but shows carbon level by season and client status. We don’t see the same gap widening between client and non-client plots that we see with nitrogen.
keDat %>%
filter(!is.na(d_client)) %>%
group_by(season, d_client) %>%
summarize(mean = round(mean(organic.carbon, na.rm = T),4)) %>%
kable(caption = "Carbon levels trend together by client status over seasons") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| season | d_client | mean |
|---|---|---|
| 2015 | 0 | 2.1606 |
| 2015 | 1 | 2.1700 |
| 2016 | 0 | 1.8588 |
| 2016 | 1 | 1.8506 |
| 2017 | 0 | 1.8625 |
| 2017 | 1 | 1.8632 |
if you applied compost since there are so many 0s and removing super large values
There’s no clear visual trend between compost application and nitrogen rates.
keDat %>%
filter(compost.acre > 0 & compost.acre < 1000) %>%
ggplot(., aes(x = compost.acre, y = total.nitrogen, color = as.factor(d_client))) +
geom_point() +
facet_grid(~ season) +
labs(title = "N vs. compost", subtitle = "Relationship less clear", x = "Compost kg / acre", y = "")
# dap 18% n, can 26%
keDat %>%
mutate(can_n = can.acre * 0.26,
dap_n = dap.acre * 0.18) %>%
rowwise() %>%
mutate(total_n_applied = sum(can_n, dap_n, na.rm = T)) %>%
filter(total_n_applied > 0) %>%
ggplot(., aes(x = total_n_applied, y = total.nitrogen, color = as.factor(d_client))) +
geom_point() +
facet_wrap(~ season + d_client)
Interpretation: There doesn’t seem to be a strong relationship between total N applied and total nitrogen in the soil. It’s not clear to me what we should have expected from this relationship but in general I don’t see anything odd or a clear difference between the 1AF plots and the treatment plots.
unique_seasons <- unique(keDat$season)
nitrogen_v_compost <- lapply(unique_seasons, function(year){
return(keDat %>%
filter(season == year) %>%
mutate(can_n = can.acre * 0.26,
dap_n = dap.acre * 0.18) %>%
rowwise() %>%
mutate(total_n_applied = sum(can_n, dap_n, na.rm = T)) %>%
filter(total_n_applied > 0)) %>%
filter(compost.acre < 2000 & compost.acre > 0)
})
lapply(nitrogen_v_compost, function(x){
ggplot(x, aes(x = total_n_applied, y = compost.acre, color = as.factor(d_client))) +
geom_point() +
#facet_wrap(~ d_client, scales = 'free') +
geom_smooth(method = 'lm') +
labs(title = paste("Total N applied / acre vs. compost / acre in", unique(x$season)),
subtitle = "Compost / acre less than 2000 kgs",
x = "Total kg N / acre",
y = "Total kg compost / acre")
})
## [[1]]
##
## [[2]]
##
## [[3]]
Interpretation I don’t see a strong difference across seasons between total N applied and total compost applied. The trend lines seem generally the same or at least visually not so strong to suggest that 1AF plots and control plots are having different experiences. The 2017 plot indicates a slightly flatter relationship between applied N and applied C on 1AF plots but this is not super dramatic.
applied_c_vs_soil_n <- lapply(unique_seasons, function(year){
return(keDat %>%
filter(season == year) %>%
filter(compost.acre < 2000 & compost.acre > 0))
})
lapply(applied_c_vs_soil_n, function(x){
ggplot(x, aes(x = compost.acre, y = total.nitrogen, color = as.factor(d_client))) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = paste("Applied C vs. soil N in", unique(x$season)),
subtitle = "No clear difference",
x = "Total kg compost / acre",
y = "Total soil N (%))")
})
## [[1]]
##
## [[2]]
##
## [[3]]
Interpretation As with the applied N vs. applied C, I don’t see a clear difference in outcome between 1AF and control plots when we look at applied C and soil N.
intercrop_check <- keDat %>%
mutate(legume_intercrop = ifelse(intercrop %in% c("beans", "groundnuts", "soya"), 1, 0)) %>%
filter(!is.na(sample_id))
intercrop_check %>%
group_by(sample_id, d_client) %>%
summarize(legume_count = sum(legume_intercrop, na.rm = T),
total.nitrogen.average = mean(total.nitrogen, na.rm = T)) %>%
filter(legume_count <= 3) %>%
ggplot(., aes(x = as.factor(legume_count), y = total.nitrogen.average, fill = as.factor(d_client))) +
geom_boxplot() +
labs(title = "Legume intercropping (beans, groundnuts, soya) and nitrogen",
subtitle = "Control plots have slightly higher N across years of legume intercrops",
x = "Number of seasons of legume intercrops",
y = "Total soil N (%)", fill = "1AF status") +
theme_oaf()
The above plot shows N rates by the number of legume intercrops. It’s not a perfect view of what happens following a season in which a farmer rotates to beans but does show the cumulative benefit of additional seasons of a legume intercrop on the plot. Do 1AF farmers have fewer intercrops? This is hard to precisely pin down since the plots can shift from 1AF cultivation to not-1AF cultivation season to season. Therefore I’ll look at the average years of 1AF cultivation vs. legume intercrops
intercrop_check %>%
group_by(sample_id) %>%
summarize(years_oaf = sum(d_client, na.rm = T),
number_intercrops = sum(legume_intercrop, na.rm = T)) %>%
group_by(years_oaf, number_intercrops) %>%
tally() %>%
filter(years_oaf <= 3) %>%
filter(number_intercrops < 4) %>%
ggplot(., aes(x = as.factor(years_oaf), y = n, fill = as.factor(number_intercrops))) +
geom_bar(stat = 'identity', position = 'dodge') +
labs(title = "Legume intercrops by 1AF tenure", x = "Years of cultivation under 1AF", y = "Count of plots",
subtitle = "") +
theme(legend.position = 'bottom') +
theme_oaf()
We’re looking for an indication that 1AF plots have fewer legume intercrops and it does appear that plots that have not been cultivated with 1AF have more years of legume intercropping (the blue bar on the left) that for plots that are 1AF plots for more seasons. This relationship is not necessarily statistically different as we also see years of numerous legume intercrops for 1AF plots as well. The main distinguishing feature however is the big bar indicating that plots that have never been 1AF plots have had 3 legume intercrops more than 1AF plots.
library(broom)
legume_model_data <- intercrop_check %>%
group_by(sample_id) %>%
summarize(years_oaf = sum(d_client, na.rm = T),
number_intercrops = sum(legume_intercrop, na.rm = T),
average_soil_nitrogen = mean(total.nitrogen, na.rm = T))
# plot these data to understand relationship.
legume_model_data %>%
filter(number_intercrops < 4) %>%
ggplot(., aes(x = as.factor(years_oaf), y = average_soil_nitrogen, fill = as.factor(number_intercrops))) +
geom_boxplot() +
theme_oaf() +
labs(title = "Soil N by 1AF seasons and legume cultivations",
x = "1AF seasons",
y = "Average soil N",
fill = "Legume cultivations")
cat("R squard for simple model:", summary(lm(average_soil_nitrogen ~ as.factor(number_intercrops), data = legume_model_data))$r.squared)
## R squard for simple model: 0.0056116
cat("R squard for model including years_oaf:", summary(lm(average_soil_nitrogen ~ as.factor(number_intercrops) + years_oaf, data = legume_model_data))$r.squared)
## R squard for model including years_oaf: 0.005687464
The r2 of models looking at the simple relationship of years of legume cultivation and soil nitrogen is very low. That doesn’t seem to be capturing the movement of soil nitrogen.
intercrop_check %>%
filter(legume_intercrop == 1) %>%
group_by(intercrop, d_client) %>%
tally() %>%
ggplot(., aes(x = intercrop, y = n, fill = as.factor(d_client))) +
geom_bar(stat = 'identity', position = 'dodge') +
labs(title = "Legume intercrop type by 1AF status",
subtitle = "It's mostly beans, folks and control plots are intercropped a lot more regularly",
x = "Intercrop type",
y = "Number of plots",
fill = "1AF status") +
theme_oaf()
Interpretation: About 400 more control plots were planted with a bean intercrop over the course of the study. However, we don’t see additional seasons of legume intercrop explaining soil N levels in the data.
Does increased intercropping explain the difference between 1AF and control plots? Do the other pieces of evidence line up to support this?
intercrop_check %>%
filter(legume_intercrop == 1) %>%
group_by(intercrop, d_client) %>%
tally() %>%
mutate(d_client = ifelse(d_client == 0, "control", "one-acre")) %>%
spread(d_client, n) %>%
kable(., caption = "Total plots by intercrop type") %>%
kable_styling()
| intercrop | control | one-acre |
|---|---|---|
| beans | 1835 | 1452 |
| groundnuts | 57 | 27 |
| soya | 15 | 6 |
Control plots were intercropped with beans about ~400 times more than 1AF plots which is not nothing.
intercrop_check %>%
filter(legume_intercrop == 1) %>%
group_by(intercrop, d_client) %>%
tally() %>%
ungroup() %>%
mutate(percent = paste(round(n / sum(n), 2) * 100, "%")) %>%
select(-n) %>%
mutate(d_client = ifelse(d_client == 0, "control", "one-acre")) %>%
spread(d_client, percent) %>%
kable(., caption = "Percent of plots by intercrop") %>%
kable_styling()
| intercrop | control | one-acre |
|---|---|---|
| beans | 54 % | 43 % |
| groundnuts | 2 % | 1 % |
| soya | 0 % | 0 % |
Clean up region variable
keDat$region <- ifelse(grepl("west", tolower(keDat$region)), "Western", "Nyanza")
I want to create a metric of how different treatment values are from control values by district. I’m going to subtract the control average at the district level from each treatment value for each season. I’ll then plot them individually so it’s easier to see.
control_nitrogen_level <- function(year){
return(keDat %>%
filter(!is.na(district)) %>%
filter(season == year) %>%
group_by(district) %>%
mutate(control_nitrogen_average = mean(total.nitrogen[d_client == 0], na.rm = T)) %>%
ungroup() %>%
mutate(difference_nitrogen = total.nitrogen - control_nitrogen_average) %>%
filter(d_client == 1))
}
nitrogen_plots <- lapply(c(2015, 2016, 2017), function(year){
control_nitrogen_level(year)
})
lapply(nitrogen_plots, function(x){
ggplot(x, aes(x = district, y = difference_nitrogen, fill = region)) +
geom_boxplot() +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste("Difference in nitrogen ", unique(x$season)),
subtitle = "I don't see a clear spatial trend across locations or time") +
theme_oaf()
})
## [[1]]
##
## [[2]]
##
## [[3]]
Quickly, let’s output graphs for each district:
unique_districts <- unique(keDat$district)[!unique(keDat$district) %in% NA]
lapply(unique_districts, function(dist){
keDat %>%
filter(!is.na(district)) %>%
filter(district == dist) %>%
ggplot(., aes(x = season, y = total.nitrogen, fill = as.factor(d_client))) +
geom_boxplot() +
labs(title = paste("Seasonal trend for", dist),
x = "Season",
y = "Total soil N (%)") +
theme_oaf()
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
When we look at this by district over time and 1AF plot status we definitely see a downward trend in some districts. We also see that in some cases, like Butere, the 1AF plots started well below controls. In ohters, like Suneka, we see the 1AF drop dramatically which is pretty difficult to explain. A couple other observations:
Farmers could be shifting plots with less N into 1AF cultivation and moving other plots out of 1AF rotation. Let’s check out how these trends look if we use the baseline assignment as our indicator for 1AF / control status.
When we look at the data based on the 2015 baseline treatment status, the difference between 1AF and control plots isn’t as extreme. The analytical model we’re using below should be taking into acocunt the effect of clients potentially switching their plot back and forth.
keDat %>%
ggplot(., aes(x = region, y = total.nitrogen)) +
geom_boxplot() +
facet_grid(~ season)
keDat %>%
filter(!is.na(district)) %>%
ggplot(., aes(x = district, y = total.nitrogen)) +
geom_boxplot() +
facet_grid(season ~ .) +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1))
Making maps will be more involved so I’m hoping the boxplots are enough.
Assess whether ratio of N application rates vs N uptake rates is different between treatment and control. We’d have to make some assumptions about
Quick histogram of farmer estimated yields
ggplot(keDat, aes(x = yield.t.ha)) +
geom_histogram(binwidth = 0.25) +
scale_x_continuous(limit = c(0, 20)) +
labs(title = "Histogram of farmer est. yields (less than 20 t/ha)",
subtitle = "Esitmates are very noisy. 75th percentile is 15 t/ha")
#kable(quantile(keDat$yield.t.ha, na.rm = T), caption = "Most observations are less than 15 t/ha which is still really high")
applied_n_vs_yield <- lapply(unique_seasons, function(year){
return(
keDat %>%
filter(season == year) %>%
mutate(can_n = can.acre * 0.26,
dap_n = dap.acre * 0.18) %>%
rowwise() %>%
mutate(total_n_applied = sum(can_n, dap_n, na.rm = T)) %>%
mutate(total_n_applied_ha = total_n_applied * 2.47) %>%
filter(total_n_applied > 0) %>%
filter(yield.t.ha < 10)) %>%
as.data.frame()
})
lapply(applied_n_vs_yield, function(x){
ggplot(x, aes(x = total_n_applied_ha, y = yield.t.ha, color = as.factor(d_client))) +
geom_point() +
facet_wrap(~ season, scales = 'free') +
geom_smooth(method = 'lm') +
labs(title = paste("Applied N vs. farmer estimated yield", unique(x$season)),
subtitle = "Limiting farmer est. yield to < 10 t/ha - Trend lines are messy but suggest no difference",
x = "Total kg N / ha ",
y = "Yield t/ha",
fill = "1AF status")
})
## [[1]]
##
## [[2]]
##
## [[3]]
N applied as manure instead of plant material. Let’s look at compost by type by client status by season
keDat %>%
mutate(compost_category = ifelse(grepl("pig|goat|cow|chicken", compost.type), "animal", "plant")) %>%
filter(compost.kgs > 0) %>%
filter(compost.acre < 2000) %>%
ggplot(., aes(x = as.factor(compost_category), y = compost.acre, fill = as.factor(d_client))) +
geom_boxplot() +
facet_wrap(~ season) +
labs(title = "Compost application by type (excluding those that didn't apply)",
subtitle = "1AF plots are getting more animal manure when compost is applied",
x = "Compost type - animal or plant (plant residue / kitchen)",
y = "Total kg compost / acre",
fill = "1AF status")
Interpretation If anything this suggests that 1AF plots are getting more N rich compost. Unless I’m misunderstanding the consequence here, this should be contributing to N rates in the soil, not diminishing soil N.
keySoilVars <- c("ph", "calcium", "total.nitrogen", "organic.carbon", "magnesium")
keDat <- keDat %>% mutate(age2 = age^2)
indFeList <- list("as.factor(d_client)",
c("as.factor(d_client)", "as.factor(sample_id)"),
c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)"),
c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)", "age", "age2"))
# run this in parallel to speed up the process
# load the data and variables and packages into the cluster
regFileKe <- "regFile_through2017.RData"
forceUpdate <- forceUpdateAll
if(!file.exists(regFileKe) || forceUpdate) {
library(parallel)
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type="FORK")
clusterEvalQ(cl, "plm")
clusterExport(cl, "keDat")
clusterExport(cl, "keySoilVars")
clusterExport(cl, "indFeList")
indFeLoopKe <- parLapply(cl, indFeList, function(mod){
lapply(keySoilVars, function(outcome){
form = lm(reformulate(termlabels = mod, response = outcome), data=keDat)
pdf(file=paste("output/ke2017/", paste0(outcome, paste(mod, collapse = "")), ".pdf", sep = ""))
print(plot(form))
dev.off()
form = plm(form, c("sample_id", "age", "age2"))
rownames(form) = paste(rownames(form), outcome, sep = " ")
return(form)
})
})
stopCluster(cl)
save(indFeLoopKe, file=regFileKe)
} else {
load(regFileKe)
}
And combine model outputs into tables for each model
modExportKe <- lapply(indFeLoopKe, function(models){
do.call(rbind, models)
})
for(i in 1:length(modExportKe)){
write.csv(modExportKe[i], file=paste0("output/ke2017/","regOutput_", i, ".csv"), row.names = T)
}
In the individual fixed effect model above, the naive model would only include a client indicator and individual fixed effects. If we add season, we lose significance on almost everything. I’d guess that as we add more likely controls we additionally lose significance. I’ve included age and age squared along the lines of Hicks et.al.
finalModelKe <- modExportKe[4]
kable(finalModelKe, format="markdown")
| Coefficient | 95% Confidence Interval | P-Value | |
|---|---|---|---|
| (Intercept) ph | 5.5000 | 5 to 5.9 | <0.001 *** |
| as.factor(d_client)1 ph | -0.0340 | -0.089 to 0.02 | 0.22 |
| as.factor(season)2016 ph | -0.3900 | -0.41 to -0.37 | <0.001 *** |
| (Intercept) calcium | 470.0000 | 15 to 920 | 0.043 * |
| as.factor(d_client)1 calcium | -66.0000 | -120 to -10 | 0.02 * |
| as.factor(season)2016 calcium | -50.0000 | -75 to -24 | <0.001 *** |
| as.factor(season)2017 calcium | -28.0000 | -54 to -1.7 | 0.037 * |
| (Intercept) total.nitrogen | 0.1100 | 0.087 to 0.12 | <0.001 *** |
| as.factor(d_client)1 total.nitrogen | -0.0028 | -0.005 to -0.00055 | 0.015 * |
| as.factor(season)2016 total.nitrogen | -0.0091 | -0.01 to -0.008 | <0.001 *** |
| as.factor(season)2017 total.nitrogen | -0.0110 | -0.012 to -0.0099 | <0.001 *** |
| (Intercept) organic.carbon | 1.2000 | 0.92 to 1.5 | <0.001 *** |
| as.factor(d_client)1 organic.carbon | 0.0110 | -0.026 to 0.048 | 0.55 |
| as.factor(season)2016 organic.carbon | -0.3100 | -0.33 to -0.29 | <0.001 *** |
| as.factor(season)2017 organic.carbon | -0.3100 | -0.32 to -0.29 | <0.001 *** |
| (Intercept) magnesium | 86.0000 | 14 to 160 | 0.019 * |
| as.factor(d_client)1 magnesium | -5.8000 | -14 to 2.5 | 0.17 |
| as.factor(season)2016 magnesium | -12.0000 | -15 to -8.3 | <0.001 *** |
write.csv(finalModelKe, file="output/ke2017/indFe_ke2017.csv")
The parallel model wasn’t running for some reason so I’m just going to run the one model I really want to look at and save those results.
rwDat <- rwDat %>% mutate(age2 = age^2)
rwDat <- do.call(data.frame,lapply(rwDat, function(x){
replace(x, is.infinite(x),NA)
}))
indFeList <- list("as.factor(d_client)",
c("as.factor(d_client)", "as.factor(sample_id)"),
c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)"),
c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)", "age", "age2"))
# run this in parallel to speed up the process
# load the data and variables and packages into the cluster
regFileRw <- "regFile_through2017b.RData"
#forceUpdate <- forceUpdateAll
if(!file.exists(regFileRw) || forceUpdate) {
library(parallel)
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores, type="FORK")
clusterEvalQ(cl, "plm")
clusterExport(cl, "rwDat")
clusterExport(cl, "keySoilVars")
clusterExport(cl, "indFeList")
indFeLoopRw <- parLapply(cl, indFeList, function(mod){
lapply(keySoilVars, function(outcome){
form = lm(reformulate(termlabels = mod, response = outcome), data=rwDat)
pdf(file=paste("output/rw2017b/", paste0(outcome, paste(mod, collapse = "")), ".pdf", sep = ""))
print(plot(form))
dev.off()
form = plm(form, c("sample_id", "age", "age2"))
rownames(form) = paste(rownames(form), outcome, sep = " ")
return(form)
})
})
stopCluster(cl)
save(indFeLoopRw, file=regFileRw)
} else {
load(regFileRw)
}
modExportRw <- lapply(indFeLoopRw, function(models){
do.call(rbind, models)
})
for(i in 1:length(modExportRw)){
write.csv(modExportRw[i], file=paste0("output/rw2017b/","regOutput_", i, ".csv"), row.names = T)
}
finalModelRw <- modExportRw[4]
kable(finalModelRw, format="markdown")
write.csv(finalModelRw, file="output/rw2017b/indFe_rw2017.csv")
Nothing to see here